/* File: cln_acquisition_data.do
 * Author: Luca Maini
 * Purpose: creates a master dataset of all deals and acquisitions from SSR and 
 *			SDC and merges it to the main dataset of sales. The datasets are 
 *			acquisitionsClean and ProductLevelSalesSSRwithAcquisition.
 *
 * Date Created: 08/07/2020
 *
 */

/*	The data for this code was created manually, by manually looking for matched
	observations in the SDC and SSR datasets, and then by adding unmatched
	observations from SSR and unmatched observations from SDC that appeared
	relevant to the dataset. These observations are saved in a spreadsheet in
	three different worksheets. We import these worksheets one by one, clean
	them, and append them together. We then merge the resulting dataset to the
	longitudinal panel of drug sales, in order to determine the company that 
	was controlling US marketing for a given drug in each quarter.
*/

/////////////////////////////////////////////////////////////
/////													/////
/////		STEP 0. Get marketing start from SSR		/////
/////													/////
/////////////////////////////////////////////////////////////

use "${outdir}\ProductLevelSalesSSR.dta", clear
keep Product mkt_start
duplicates drop

tempfile startDate
save `startDate', replace


/////////////////////////////////////////////////////////////
/////													/////
/////	STEP 1. Open and append the acquisition data	/////
/////													/////
/////////////////////////////////////////////////////////////

*** Matched observations
import excel "${inputdir}\SDC_SSR_matchlist.xlsx", ///
	sheet("Matched") firstrow clear

* generate match flag
gen match_flag = "both"
	
* combine info in SSR and SDC for the same deal
#delimit ;
local SDCvarList "dateEffective synopsis acquisitionType acquisitionNotes status 
	SharesAcq OwnedAfterTransaction ValueofTransactionMil form dateAnnounced";
#delimit cr

foreach var of local SDCvarList {
	bysort dealID (`var') : replace `var' = `var'[_N] if `var' == ""
	}

* edit company names (eliminate expressions in parentheses that mention full/
* partial rights)
replace TargetName = trim(regexr(TargetName,"\(.*\)",""))
replace AcquirorName = trim(regexr(AcquirorName,"\(.*\)",""))

* Now we have many almost-duplicate observations. Keep only one observation for 
* each deal
drop if Date == ""	// this is the version of the deal from SDC

* drop variables we don't need
drop meshDate year ym

* other edits to variables
destring interest, replace
gen date = date(Date, "DMY")
format date %td
drop Date

* harmonize dates and interest acquired
bysort dealID TargetName AcquirorName Product : egen interest2 = max(interest)
drop interest
rename interest2 interest

bysort dealID TargetName AcquirorName Product : egen date2 = max(date)
drop date
rename date2 date

* Last round of duplicate drops (sometimes SSR Health has more than one 
* observation for the same deal, because one observation is for company A 
* acquiring from company B, and one is for company B divesting to company A).
duplicates drop

* save as a temporary file
tempfile mergers1
save "`mergers1'", replace


*** Unmatched observations from SSR
import excel "${inputdir}\SDC_SSR_matchlist.xlsx", ///
	sheet("Unmatched SSR") firstrow clear

* generate match flag
gen match_flag = "SSR"

* drop unneded variables
drop Links // only for reference
drop dateEffective synopsis status SharesAcq OwnedAfterTransaction form ///
	dateAnnounced	// empty
drop if year == "2006"
drop year meshDate ym // otehrwise unnecessary

* edit company names
replace TargetName = trim(regexr(TargetName,"\(.*\)",""))
replace AcquirorName = trim(regexr(AcquirorName,"\(.*\)",""))

* other edits to variables
destring interest, replace
gen date = date(Date, "DMY")
format date %td
drop Date

* drop duplicate info
bysort dealID Product (interest date) : keep if _n == _N
duplicates drop	// check

* save as a temporary file
tempfile mergers2
save "`mergers2'", replace


*** Unmatched observations from SDC
import excel "${inputdir}\SDC_SSR_matchlist.xlsx", ///
	sheet("Unmatched SDC") firstrow clear allstring

* generate match flag
gen match_flag = "SDC"
	
* Adjust target and acquiror names
foreach var in TargetName AcquirorName {
	replace `var' = trim(upper(`var'))
	}

replace TargetName = substr(TargetName, 1, strpos(TargetName, "-") - 1) ///
		if strpos(TargetName, "-") > 0 & regexm(TargetName, "BRISTOL-MYERS") == 0

* for the BMS case, grab second instance of "-"
replace TargetName = substr(TargetName, 1, ///
	strpos(subinstr(TargetName, "BRISTOL-MYERS", "BRISTOL MYERS", 1), "-") - 1) ///
		if strpos(TargetName, "-") > 0 & regexm(TargetName, "BRISTOL-MYERS") == 1

* Cosmetic edits to make the variables match to the other two datasets we just 
* imported
destring dealID, replace
gen date = date(dateEffective, "MDY")
format date %td

*** Append all datasets together
append using "`mergers1'"
append using "`mergers2'"

* get quarter of acquisition
gen acquisitionQrt = qofd(date + 2)	// some deals happen on last day of quarter
format acquisitionQrt %tq

* "clean" company names to harmonize them with SSR (mostly small cosmetic changes)
gen quarter = acquisitionQrt
rename AcquirorName companyName
do "${dodir}\support_scripts\clean_company_name.do"
rename companyName AcquirorName 

rename TargetName companyName
do "${dodir}\support_scripts\clean_company_name.do"
rename companyName TargetName 
drop quarter

* drop products acquired past their LOE date
gen temp = date(LOE, "DMY")
drop LOE
rename temp LOE
format LOE %td
drop if acquisitionQrt >= qofd(LOE)

* calculate number of products involved in each deal
bysort dealID : gen num_products = _N
label var num_products "Number of acquired products"

* destring value of deal
destring ValueofTransactionMil, replace

* Generate a broader category for each deal
gen dealCategory = "COMPANY ACQUISITION" ///
	if inlist(acquisitionType, "Acquisition of entire company", ///
							   "Acquisition of major interest", ///
							   "Merger")

replace dealCategory = "LARGE PRODUCT ACQUISITION" ///
	if inlist(acquisitionType, "Acquisition of business line", ///
							   "Acquisition of multiple products", ///
							   "Acquisition of multiple products and minority stake", ///
							   "Acquisition of entire subsidiary") | ///
		acquisitionType == "Acquisition of marketing rights" & ///
			num_products > 1


replace dealCategory = "SINGLE PRODUCT ACQUISITION" ///
	if inlist(acquisitionType, "Acquisition of single product", ///
							   "Acquisition of single product and minority stake") | ///
		acquisitionType == "Acquisition of marketing rights" & ///
			num_products == 1

replace dealCategory = "JOINT VENTURE" ///
	if inlist(acquisitionType, "Co-marketing agreement", ///
							   "Co-promotion agreement")

replace dealCategory = "DIVESTITURE" ///
	if inlist(acquisitionType, "Cessation of marketing activity", ///
							   "Divestiture of US marketing rights", ///
							   "Termination of existing co-promotion agreement", ///
							   "Termination of existing licensing agreement", ///
							   "Termination of existing marketing agreement", ///
							   "Termination of joint venture", ///
							   "Termination of partnership following LOE")

replace dealCategory = "FINANCIAL DEAL" ///
	if inlist(acquisitionType, "Acquisition of minority stake", ///
							   "Acquisition of remaining stake", ///
							   "Acquisition of royalties and payment rights", ///
							   "Sale of royalties", ///
							   "Termination of royalty flow")

replace dealCategory = "SPINOFF" ///
	if inlist(acquisitionType, "Spinoff from existing company")

replace dealCategory = "MISSING" if acquisitionType == ""

*** EXCEPTIONS: the following deals are further edited on an ad-hoc basis:

* dealID 215: 	Warner Chilcott acquires full rights of Enablex from Novartis. 
*				Prior to this, the two companies were co-marketing the product.
*				While it seems like Novartis played an active role in this, so
*				did WC, so we categorize this as a financial deal
replace dealCategory = "FINANCIAL DEAL" if dealID == 215

* dealID 18:	Lilly acquired IMClone, a company who marketed its drug Erbitux
*				together with BMS. However, Lilly did not acquire the rights to
*				Erbitux until later (see dealID 269), so we reclassify this deal
*				as a financial deal.
replace dealCategory = "FINANCIAL DEAL" if dealID == 18

* dealID 271:	Merck KGAA acquired full rights to REBIF from Pfizer. Prior to
*				this, the two companies had a co-marketing agreement whose terms
*				we could not figure out. Since Merck was already recording net
*				sales for REBIF prior to this change, we reclassify this as a
*				financial deal.
replace dealCategory = "FINANCIAL DEAL" if dealID == 271 

* dealID 75: 	Bayer acquired ALGETA through a subsidiary, but they already had
*				a marketing agreement on the only marketed drug, XOFIGO.
replace dealCategory = "FINANCIAL DEAL" if dealID == 75 

* save dataset for reference
save "${outdir}\acquisitionsClean.dta", replace

* drop deals that do not matter for product ownership or that are not real 
* acquisitions (such as spinoffs)
drop if inlist(dealCategory, "FINANCIAL DEAL", "JOINT VENTURE", "MISSING", "SPINOFF")

* Finally, we identify deals that occurred PRIOR to the marketing start of a 
* product (these are a handful of deals when a product was approved, or about to
* be approved, and its marketing rights were given up). To do so, we merge with 
* SSR data on market_start
merge m:1 Product using `startDate', keep(master match) nogen

* we drop Products that were acquired in the quarter prior to launch or earlier
drop if acquisitionQrt < qofd(mkt_start)
drop mkt_start

* save as a separate dataset to avoid missing the selection step above in 
* subsequent analysis
save "${maindir}\acquisitionsClean_selected.dta", replace


* Drop variables we don't need
keep TargetName AcquirorName Product acquisitionQrt dealCategory dealID

* reshape wide to get a dataset that is uniquely identified by Product
bysort Product (acquisitionQrt) : gen n = _n
reshape wide TargetName AcquirorName dealCategory acquisitionQrt dealID, ///
	i(Product) j(n)


* merge on to the main SSR Health data (the only product we would not match are 
* vaccines, which are excluded from this analysis)
merge 1:m Product using "${outdir}\ProductLevelSalesSSR.dta", ///
	keep(using match) nogen

format quarter acquisitionQrt* %tq

* Now create the "controlling company variable"
gen mktCompany = Company if acquisitionQrt1 == .	// this for products that don't have deals

replace mktCompany = TargetName1 ///
	if quarter < acquisitionQrt1 & acquisitionQrt1 != . & mktCompany == ""

replace mktCompany = AcquirorName1 ///
	if quarter >= acquisitionQrt1 & ///
	   quarter < acquisitionQrt2 & acquisitionQrt1 != . & ///
	   mktCompany == "" & dealCategory1 != "JOINT VENTURE"

replace mktCompany = TargetName1 + "; " + AcquirorName1 ///
	if quarter >= acquisitionQrt1 & ///
	   quarter < acquisitionQrt2 & acquisitionQrt1 != . & ///
	   mktCompany == "" & dealCategory1 == "JOINT VENTURE"

replace mktCompany = AcquirorName2 ///
	if quarter >= acquisitionQrt2 & ///
	   quarter < acquisitionQrt3 & acquisitionQrt2 != . & mktCompany == ""

replace mktCompany = AcquirorName3 ///
	if quarter >= acquisitionQrt3 & ///
	   quarter < acquisitionQrt4 & acquisitionQrt3 != . & mktCompany == ""

replace mktCompany = AcquirorName4 ///
	if quarter >= acquisitionQrt4 & acquisitionQrt4 != . & mktCompany == ""

* run through company name edits again
rename mktCompany companyName
do "${dodir}\support_scripts\clean_company_name.do"
rename companyName mktCompany 

	* drop variables that are no longer needed
drop TargetName* AcquirorName* Company

* Save dataset
save "${outdir}\ProductLevelSalesSSRwithAcquisition.dta", replace
